####################################################
#Author: Kelli Marquardt
#Purpose: Simulate Fake patient level dataset for replication package, Mis(sed) Diagnosis: Physician Decision Making and ADHD 

# Inputs:

# Outputs:
#- data/est_dat_fake.csv

####################################################

############################
#0 load required packages
############################
rm(list = ls(all.names = TRUE))

required.packages = c("dplyr", "MASS")

missing=setdiff(required.packages, rownames(installed.packages()))

if (length(missing) > 0) {
  message("Installing missing packages: ", paste(missing, collapse = ", "))
  install.packages(missing, dependencies = TRUE)
} else {
  message("All required packages are already installed.")
}
rm(missing, required.packages)

#load packages
library(MASS)
library(dplyr) 


####################################
# set seed for replication
seed_for_fake_data=12345
set.seed(seed_for_fake_data)
####################################



######################################
#step 1: define helper functions  
######################################

#force vector to be between lo and hi
clamp = function(x, lo, hi) pmin(pmax(x, lo), hi)

#random uniform with specific mean and bounds
runif_shifted_mean = function(n, lo, hi, target_mean = NA_real_) {
  u = runif(n, lo, hi)
  if (is.na(target_mean)) return(u)
  mid = (lo + hi) / 2
  delta = target_mean - mid
  clamp(u + delta, lo, hi)
}

#random Bernoulli with specific prob
rbern = function(p) rbinom(length(p), size = 1, prob = p)

#random negative binomial with stated mean, sd, and bounds
rnegbinom_trunc = function(n, mean, sd, lo, hi) {
  size=mean^2/(sd^2-mean)
  x=rnbinom(n, size=size, mu=mean)
  clamp(x, lo, hi)
}

#shrink an integer x to something in lo_bound to x 
shrink_int_leq = function(x, lo_bound) {
  as.integer(floor(runif(length(x), min = lo_bound, max = x + 1)))
}


######################################
#step 2: note the target values from tables in paper
######################################

n_patients = 10000 #number of patients
p_male = 0.508 #% male (table 2)

# mu, sig, cbar, rho, tau (table 6)
male_param_vec = c(.208, .501, 1.027, .397, .484)
female_param_vec = c(.162, .486, 1.167, .484, .649)
names(male_param_vec) = names(female_param_vec)=c("mu","sig","c_bar", "rho", "tau")


# Other values from Table 2
p_hispanic = 0.494
p_white    = 0.561
p_medicaid = .535
mean_age = 10.316
age_lo = 5
age_hi = 18
mean_numdocs = 1.933  
sd_numdocs = 1.502
numdocs_lo = 1
numdocs_hi = 15
mean_numapt  = 3.269 
sd_numapt = 4.111
numapt_lo = 1
numapt_hi = 85
mean_numapt_notpcp_first = 2.542  
sd_numapt_notpcp_first = 3.849
mean_yr_insample = 1.69
yr_insample_lo = 1
yr_insample_hi = 4

# Values from Table A1 (private insurance)
p_privins=mean(.416,.426) #average of the male and female means 

# Values from Table A8 
p_external_given_Q1   = 0.026
p_internal_given_Q1 = .215

# Values from Table B1
n_diag_docs= 159
n_first_pcp = 100 + 20 #note, add 20 to account for those who drop in first stage of estimation using simulated IPCP ids
p_doc_overlap = .28

# Values from Figure C2 (rough approx. for sd and max)
note_length_D1_mean=375
note_length_D1_sd=225
note_length_D1_max=1000
note_length_D0_mean=480
note_length_D0_sd=260
note_length_D0_max=1250

#  Values from Table C2 (correlations between xi_60 and c(xi_01, xi_30, xi_all))
xi_cor_mean=c(.986, .993,.934)


# Other values needed for simulation but not in paper table/figure 
p_sip_pcp= .9 #90% of patients have a visit with an SIP that is also PCP (see appendix B2)
p_type2max =  0.88 #probability that type2 match is the maximum
birth_year_mean = 2005
birth_year_lo = 1999
birth_year_hi = 2012
p_psych_doc = 0.07 #prob. visit with psych doc specialty 
p_well      = 0.06 #prob. visit with wellness-related descriptor
p_behav     = 0.05 #prob. visit with behavior-related descriptor 
sd_pcp_shift=.4 #pick sd of the PCP cost shifters (less than 1)



######################################
#step 3: build the base data frame with model-based variables
######################################

#first start with a function that returns Qi, Di, and xi given pcp_shifter
simulateData = function(param_vec, N, pcp_id, pcp_shift) {
  stopifnot(length(pcp_id) == N, length(pcp_shift) == N)
  
  mu_s   = param_vec["mu"]
  sig_s  = param_vec["sig"]
  cbar_s = param_vec["c_bar"]
  rho_s  = param_vec["rho"]
  tau_s  = param_vec["tau"]
  
  # simulate (vi, xi) with corr rho and same mean/var
  Mu_sim = c(mu_s, mu_s)
  Sigma2 = matrix(c(sig_s^2, rho_s*sig_s^2,
                     rho_s*sig_s^2, sig_s^2), 2, 2, byrow=TRUE)
  
  vx = mvrnorm(N, Mu_sim, Sigma2)
  vi = vx[,1]
  xi_full = vx[,2]
  
  # selection threshold: ci = cbar + pcp_shift + e, where pcp_shift + e ~ N(0,1)
  e  = rnorm(N, 0, sqrt((1-sd_pcp_shift^2)))
  ci = cbar_s + pcp_shift + e
  Qi = as.integer(vi > ci)
  
  
  # diagnosis model 
  ui = runif(N, 0, 1)
  probDi_x = pnorm((rho_s*xi_full + (1-rho_s)*mu_s - tau_s) /
                      (sig_s * sqrt(1 - rho_s^2)))
  Di = ifelse(Qi==1 & probDi_x > ui, 1, 0)
  
  data.frame(
    Qi = Qi,
    Di = Di,
    xi = ifelse(Qi==1, xi_full, NA) )
}

### build skeleton with pat_id and male first
dat = data.frame(
  pat_id = 1:n_patients,
  male   = rbinom(n_patients, 1, p_male)
)

# PCP ids 
set.seed(seed_for_fake_data+1)
pcp_pool = sprintf("%05d", sample(0:99999, size = (n_first_pcp), replace = FALSE))

# assign PCP ensuring every PCP appears in each gender pair at least 4 times
dat$first_pcp_id = NA
idx_m = which(dat$male == 1)
idx_f = which(dat$male == 0)

# make sure at least 4*n_first_pcp in each gender (true with n=10000)
dat$first_pcp_id[idx_m[1:(4*n_first_pcp)]] = rep(pcp_pool,4)
dat$first_pcp_id[idx_f[1:(4*n_first_pcp)]] = rep(pcp_pool,4)

# remaining get random PCPs
set.seed(seed_for_fake_data+2)
dat$first_pcp_id[is.na(dat$first_pcp_id) & dat$male==1] = sample(pcp_pool, sum(is.na(dat$first_pcp_id) & dat$male==1), replace=TRUE)
dat$first_pcp_id[is.na(dat$first_pcp_id) & dat$male==0] = sample(pcp_pool, sum(is.na(dat$first_pcp_id) & dat$male==0), replace=TRUE)


# create pcp_shift values that will be used to influence ci in simulation
set.seed(seed_for_fake_data+3)
pcp_shift_table = expand.grid(first_pcp_id = pcp_pool, male = c(0,1)) %>%
  mutate(pcp_shift = rnorm(n(), mean = 0,
                           sd = sd_pcp_shift))

dat = dat %>% left_join(pcp_shift_table, by = c("first_pcp_id","male"))

#now simulate male and female values for xi, Qi, Di 
set.seed(seed_for_fake_data+4)
idx_m = which(dat$male==1)
male_sim = simulateData(
  male_param_vec,
  N = length(idx_m),
  pcp_id = dat$first_pcp_id[idx_m],
  pcp_shift = dat$pcp_shift[idx_m]
)
male_sim$male = 1
male_sim$pat_id = dat$pat_id[idx_m]
male_sim$first_pcp_id = dat$first_pcp_id[idx_m]

idx_f = which(dat$male==0)
female_sim = simulateData(
  female_param_vec,
  N = length(idx_f),
  pcp_id = dat$first_pcp_id[idx_f],
  pcp_shift = dat$pcp_shift[idx_f]
)
female_sim$male = 0
female_sim$pat_id = dat$pat_id[idx_f]
female_sim$first_pcp_id = dat$first_pcp_id[idx_f]

# combine
dat = rbind(male_sim, female_sim) %>% as.data.frame()

######################################
#step 4: add controls
######################################
# add base controls
set.seed(seed_for_fake_data+5)
dat = dat %>%
  mutate(
    age_mean = runif_shifted_mean(n(), age_lo, age_hi, mean_age),
    numapt   = rnegbinom_trunc(n(), mean = mean_numapt, sd = sd_numapt,
                               lo = numapt_lo, hi = numapt_hi),

    # race/ethnicity 
    hispanic = rbinom(n = n(), size = 1, prob = p_hispanic),
    white    = rbinom(n = n(), size = 1, prob = p_white),
    
    # insurance group categories
    ins_group = sample(
      x = c("medicaid", "commercial", "other"),
      size = n(), 
      replace = TRUE,
      prob = c(p_medicaid, p_privins, 1 - p_medicaid - p_privins)
    ),
    
    # birth_year integer
    birth_year = as.integer(
      round(runif_shifted_mean(n(),
                               lo = birth_year_lo,
                               hi = birth_year_hi,
                               target_mean = birth_year_mean))
    ),
    birth_year = clamp(birth_year, birth_year_lo, birth_year_hi),
    
    # psych_doc / well / behav 
    psych_doc = rbinom(n = n(), size = 1, prob = p_psych_doc),
    well      = rbinom(n = n(), size = 1, prob = p_well),
    behav     = rbinom(n = n(), size = 1, prob = p_behav),
    
    # mh_dx_other_* must be 0 if Qi==0
    mh_dx_other_internal = case_when(
      Qi == 0 ~ 0,
      TRUE      ~ rbern(rep(p_internal_given_Q1, n()))
    ),
    mh_dx_other_external = case_when(
      Qi == 0 ~ 0,
      TRUE      ~ rbern(rep(p_external_given_Q1, n()))
    ),
    
    # numdocs in [1,15]
    numdocs = as.integer(rnegbinom_trunc(n(), mean = mean_numdocs,
                                         sd= sd_numdocs, 
                                     lo = numdocs_lo, hi = numdocs_hi)),
    
    # numapt_notpcp_first (must be less than numapt) 
    numapt_notpcp_first =as.integer(pmin(rnegbinom_trunc(n(), 
                                                mean=mean_numapt_notpcp_first, 
                                                sd=sd_numapt_notpcp_first, 
                                                lo=0, hi=numapt_hi),
                                numapt)),
    

    #indicator for type2 being the max type xi=xi_type2=max(xi_type1, xi_type2)
      type2_max=ifelse(is.na(xi), NA, 
                              rbinom(n(), size=1, prob=p_type2max)),
    
    
    
    #varying xi inclusion window variables 
    xi_01= case_when(
      Qi==0 ~ NA, 
      Di==0 ~ xi, 
      T ~ xi_cor_mean[1]*xi+sqrt((1-xi_cor_mean[1]^2))*rnorm(n(), 0, 1)),

    xi_30= case_when(
      Qi==0 ~ NA, 
      Di==0 ~ xi, 
      T ~ xi_cor_mean[2]*xi+sqrt((1-xi_cor_mean[2]^2))*rnorm(n(), 0, 1)),
    
    xi_60= xi,
    
    xi_all= ifelse(is.na(xi), NA, 
                  xi_cor_mean[3]*xi+sqrt((1-xi_cor_mean[3]^2))*rnorm(n(), 0, 1)),

    
    #note length 
    note_length = case_when(
      Qi == 0 ~ NA,
      Di == 1 ~ rnegbinom_trunc(n(), 
                                mean=note_length_D1_mean, 
                                sd=note_length_D1_sd, 
                                lo=1, hi=note_length_D1_max), 
      T ~ rnegbinom_trunc(n(), 
                          mean=note_length_D0_mean, 
                          sd=note_length_D0_sd, 
                          lo=1, hi=note_length_D0_max)),
    
    #indicator for whether the patient was pulled into sample via a SIP that is also a PCP
    sip_is_pcp=rbinom(n = n(), size = 1, prob = p_sip_pcp)
    
    )%>%
      as.data.frame()
    


######################################
# step 5: add year_2014-year_2017 so they sum to numapt
######################################

n = nrow(dat)

# draw intended years-in-sample with mean ~1.70 
set.seed(seed_for_fake_data+6)
k_draw = sample(1:4, size = n, replace = TRUE, prob = c(0.52, 0.31, 0.12, 0.05))

# must have k <= numapt (since each nonzero year must get at least 1 appt)
k = pmin(k_draw, dat$numapt, 4)

# initialize year columns
dat$year_2014 = 0
dat$year_2015 = 0
dat$year_2016 = 0
dat$year_2017 = 0

years_vec = c(2014, 2015, 2016, 2017)

for (i in seq_len(n)) {
  ki = k[i]
  Ni = dat$numapt[i]
  
  # choose which years are "in sample"
  active_years = sample(years_vec, size = ki, replace = FALSE)
  
  # give each active year at least 1 appointment, then distribute remainder
  rem = Ni - ki
  if (rem > 0) {
    add = as.integer(rmultinom(1, size = rem, prob = rep(1/ki, ki)))
  } else {
    add = rep(0, ki)
  }
  vals = 1 + add
  
  # assign into the right year_* columns
  for (j in seq_len(ki)) {
    y = active_years[j]
    if (y == 2014) dat$year_2014[i] = vals[j]
    if (y == 2015) dat$year_2015[i] = vals[j]
    if (y == 2016) dat$year_2016[i] = vals[j]
    if (y == 2017) dat$year_2017[i] = vals[j]
  }
}




######################################
# step 6: add diag_doc
# - 5-digit string
# - NA when Di==0
# - 159 unique values total
# - overlap: 28 of the 100 first_pcp_id values are also used as diag_doc ids
######################################
set.seed(seed_for_fake_data+7)
pcp_pool = sort(unique(dat$first_pcp_id))

# choose the overlapping doc IDs: 28 PCP ids reused as diag_doc ids
n_overlap = round(p_doc_overlap * n_first_pcp)  
diag_overlap_ids = sample(pcp_pool, size = n_overlap, replace = FALSE)

# create the remaining doc IDs as new 5-digit codes not in pcp_pool
n_new = n_diag_docs - n_overlap

all5 = sprintf("%05d", 0:99999)

diag_new_ids = sample(setdiff(all5, pcp_pool), size = n_new, replace = FALSE)
diag_pool = c(diag_overlap_ids, diag_new_ids)

# assign diag_doc only for Di==1
dat$diag_doc = NA_character_
idx_D1 = which(dat$Di == 1)
dat$diag_doc[idx_D1] = sample(diag_pool, size = length(idx_D1), replace = TRUE)



######################################
# step 7: define *_uptoQ1 variables
# Rule:
#  - if Qi==0: var_uptoQ1 == var
#  - if Qi==1: var_uptoQ1 <= var   (weakly)
######################################
set.seed(seed_for_fake_data+8)
# year_*_uptoQ1
for (yr in 2014:2017) {
  v = paste0("year_", yr)
  v_u = paste0(v, "_uptoQ1")
  
  dat[[v_u]] = dat[[v]]  # default: Qi==0 stays equal
  
  idx1 = which(dat$Qi == 1)
    # keep exact when original is 0; otherwise shrink to [0, original]
    x = dat[[v]][idx1]
    dat[[v_u]][idx1] = ifelse(x == 0, 0, shrink_int_leq(x, lo_bound = 0))
  
}

# age_mean_uptoQ1
dat$age_mean_uptoQ1 = dat$age_mean

idx1 = which(dat$Qi == 1)
  # subtract a nonnegative amount, but never let age drop below 5
  delta_age = runif(length(idx1), min = 0, max = 2.0)
  dat$age_mean_uptoQ1[idx1] =
    pmax(dat$age_mean[idx1] - delta_age, age_lo)  # age_lo = 5

# psych_doc / well / behav uptoQ1 (binary): if Qi==1, can only turn 1->0, keep 0->0
for (v in c("psych_doc", "well", "behav")) {
  v_u = paste0(v, "_uptoQ1")
  dat[[v_u]] = dat[[v]]
  idx1 = which(dat$Qi == 1 & dat[[v]] == 1)
    # fraction of time it already happened before Q=1
    keep_prob = 0.6   
    dat[[v_u]][idx1] = rbinom(length(idx1), size = 1, prob = keep_prob)
}

# numdocs_uptoQ1 (count): if Qi==1, shrink to <= numdocs, keep >=1
dat$numdocs_uptoQ1 = dat$numdocs
idx1 = which(dat$Qi == 1)
  x = dat$numdocs[idx1]
  # keep exact when x==1; otherwise shrink to [1, x]
  dat$numdocs_uptoQ1[idx1] = ifelse(x <= 1, x, shrink_int_leq(x, lo_bound = 1))


######################################
# step 8: other variables needed
######################################

#of those with Di=1, need ~ 95 to have multiple assessments, the rest 0 
dat$mult_assessments=0
idx1 = which(dat$Di == 1)
idx1= idx1[1:95]
dat$mult_assessments[idx1]=1
  
  
#############################################################
#Step 9: clean up and save 
#############################################################


dat= dat%>%
  mutate(Med=as.integer(ins_group=="medicaid"),
         Com=as.integer(ins_group=="commercial"))%>%
  select(pat_id, male, hispanic, white, Med, Com, birth_year,
         first_pcp_id, diag_doc, 
         Qi, xi, Di, type2_max, xi_01, xi_30, xi_60, xi_all, note_length,mult_assessments, sip_is_pcp,   
         age_mean, psych_doc, well, behav, numdocs, numapt, numapt_notpcp_first, 
         mh_dx_other_external, mh_dx_other_internal, 
         year_2014, year_2015, year_2016, year_2017, 
         age_mean_uptoQ1, psych_doc_uptoQ1, well_uptoQ1, behav_uptoQ1, numdocs_uptoQ1, 
         year_2014_uptoQ1, year_2015_uptoQ1, year_2016_uptoQ1, year_2017_uptoQ1)%>%
  as.data.frame()



#################################################################################
# save
#################################################################################

write.csv(dat, "data/est_dat_fake.csv", row.names = F)


#END OF SCRIPT
